Load required packages and read in combined data.
# Load analysis packages ----
pacman::p_load(
  dplyr, tidyr, ggplot2, rjson, rdatacite, cowplot, stringr, knitr, DT
)

# Combined DOI data produced by 3_Combined_data.R
load(file = "data_rdata_files/Combined_ALL_data.Rdata")

# Keep only records published in 2012 or later
all_dois <- combined_dois %>%
  filter(publicationYear >= 2012)
Look at DOIs by their origin (all types)
# Tabulate DOI counts by origin group
all_dois %>%
  count(group, name = "count") %>%
  kable()
| group | count |
|---|---|
| Affiliation - CrossRef | 147702 |
| Affiliation - Datacite | 51053 |
| IR_publisher | 24104 |
General data cleaning
# DRUM is inconsistently specified (with and without the "(DRUM)" suffix);
# normalize every variant to the full label.
all_dois$publisher[grep("Data Repository for the University of Minnesota", all_dois$publisher)] <- "Data Repository for the University of Minnesota (DRUM)"

# Remove MorphoSource data, as affiliation isn't included.
# Logical subsetting replaces the original -which() form: if no rows had
# matched, df[-integer(0), ] would have silently dropped *every* row.
all_dois2 <- all_dois[!(all_dois$publisher_plus %in% "Duke-MorphoSource Media"), ]

# Make sure "Dataset" is capitalized in all metadata resource types
all_dois2$resourceTypeGeneral[which(all_dois2$resourceTypeGeneral == "dataset")] <- "Dataset"
Look at all the Institutional Repositories Captured
# Count IR-published DOIs per repository label
IR_pubs <- all_dois2 %>%
  filter(group == "IR_publisher") %>%
  count(publisher_plus, name = "count")

IR_pubs %>%
  kable(col.names = c("Institutional Repository", "Count"))
| Institutional Repository | Count |
|---|---|
| Cornell | 4758 |
| Duke-Duke Digital Repository | 76 |
| Duke-Research Data Repository, Duke University | 147 |
| Michigan | 10 |
| Michigan-Deep Blue | 637 |
| Michigan-ICPSR/ISR | 109 |
| Michigan-Other | 57 |
| Minnesota | 692 |
| Virginia Tech | 333 |
| Washington U | 4085 |
Replace all of these publishers with “Institutional Repository” so that they will be represented in a single bar.
# Collapse every IR publisher into a single "Institutional Repository"
# label so they appear as one bar in the plots below.
all_dois2$publisher[all_dois2$publisher_plus %in% IR_pubs$publisher_plus] <- "Institutional Repository"
# Catch the rest of the "Cornell University Library" records
all_dois2$publisher[which(all_dois2$publisher == "Cornell University Library")] <- "Institutional Repository"
# ...and stray Virginia Tech
all_dois2$publisher[which(all_dois2$publisher == "University Libraries, Virginia Tech")] <- "Institutional Repository"
# ...and DRUM
all_dois2$publisher[which(all_dois2$publisher == "Data Repository for the University of Minnesota (DRUM)")] <- "Institutional Repository"
# ICPSR is also inconsistently named.
# BUG FIX: the original computed grep(..., all_dois$publisher) — row indices
# from all_dois are misaligned with all_dois2 after the MorphoSource rows
# were removed above, so the wrong rows could be relabeled. Index all_dois2.
all_dois2$publisher[grep("Consortium for Political", all_dois2$publisher)] <- "ICPSR"
Counts by resource type
# DOI counts per institution and resource type, most common type first
by_resource <- all_dois2 %>%
  group_by(institution, resourceTypeGeneral) %>%
  summarize(count = n()) %>%
  arrange(institution, desc(count))
Create a table of top resources
# Wide table of resource-type counts: one column per institution,
# plus a cross-institution total, sorted by that total.
by_resource_table <- by_resource %>%
  #filter(resourceTypeGeneral %in% c("Dataset", "Software", "Text", "Image")) %>%
  pivot_wider(
    names_from  = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

# Interactive view of the table
by_resource_table %>%
  datatable()

Write out the resources

write.csv(by_resource_table,
          file = "data_summary_data/Counts of Resource Types by Insitution.csv",
          row.names = FALSE)
Subset to only datasets
# Keep only DOIs whose resource type is "Dataset"
data_dois <- all_dois2 %>%
  filter(resourceTypeGeneral == "Dataset")

Data DOIs by publisher

# Data-DOI counts per publisher within each institution
by_publisher_data <- data_dois %>%
  group_by(publisher, institution) %>%
  summarize(count = n()) %>%
  arrange(institution, desc(count))

# Wide version: one column per institution plus an overall total
by_publisher_data_table <- by_publisher_data %>%
  pivot_wider(
    names_from  = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

by_publisher_data_table %>%
  datatable()

Write out the table of data publishers

write.csv(by_publisher_data_table,
          file = "data_summary_data/Counts of Data Publishers By Insitituion.csv",
          row.names = FALSE)
After reviewing the repositories, we will remove the Faculty Opinions LTD records (these are reviews of articles, no data attached) and the ENCODE Data Coordination Center (extreme outlier for Michigan in 2022, unclear what the level of DOI assignment is) from further analysis.
Subset to only software (only DataCite assigns the Software resource type)
# Keep only software DOIs
software_dois <- all_dois2 %>%
  filter(resourceTypeGeneral == "Software")

# Software-DOI counts per publisher within each institution
by_publisher_software <- software_dois %>%
  group_by(publisher, institution) %>%
  summarize(count = n()) %>%
  arrange(institution, desc(count))

# Wide version with per-institution columns and an overall total
by_publisher_software_table <- by_publisher_software %>%
  pivot_wider(
    names_from  = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

by_publisher_software_table %>%
  datatable()

Write out the table of software publishers

write.csv(by_publisher_software_table,
          file = "data_summary_data/Counts of Software Publishers By Insitituion.csv",
          row.names = FALSE)
Plot publishers by rank, ordered from most DOIs to least (take top 20). Remove Encode and Faculty opinions LTD from the list.
# Drop two repositories that distort the counts (see discussion above):
# - ENCODE Data Coordination Center: extreme Michigan outlier in 2022
# - Faculty Opinions Ltd: reviews of articles, no data attached
by_publisher_data <- by_publisher_data %>%
  filter(publisher != "ENCODE Data Coordination Center",
         publisher != "Faculty Opinions Ltd")
by_publisher_data_table <- by_publisher_data_table %>%
  filter(publisher != "ENCODE Data Coordination Center",
         publisher != "Faculty Opinions Ltd")

# Rank publishers by total data-DOI count and plot the top 20.
# row_number() replaces order(count, decreasing = T): on the
# already-sorted counts it yields the same 1..n ranks, without the
# T-instead-of-TRUE shorthand.
by_publisher_data %>%
  group_by(publisher) %>%
  summarize(count = sum(count)) %>%
  arrange(desc(count)) %>%
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 20), n.breaks = 20) +
  labs(x = "Publisher Rank", y = "Number of DOIs",
       title = "Number of DOIs by top Publishers") +
  coord_cartesian(xlim = c(1, 20)) +
  theme_bw()
Based on the graph above, it appears that there is a large drop off after the top 7 publishers. If we look at the top 7 publishers for the data dois, how many DOIs does this cover?
# The seven publishers with the most data DOIs
top7pubs <- by_publisher_data_table$publisher[1:7]

# What share of all data DOIs do those seven publishers cover?
by_publisher_data %>%
  group_by(publisher) %>%
  summarize(count = sum(count)) %>%
  mutate(intop7pub = publisher %in% top7pubs) %>%
  group_by(intop7pub) %>%
  summarize(totalDOIs = sum(count), nrepos = n()) %>%
  ungroup() %>%
  mutate(propDOIs = totalDOIs / sum(totalDOIs)) %>%
  kable(col.names = c("In Top 7 Publishers", "Total N DOIs",
                      "Total N Publishers", "Proportion of Total DOIs"))
| In Top 7 Publishers | Total N DOIs | Total N Publishers | Proportion of Total DOIs |
|---|---|---|---|
| FALSE | 2274 | 159 | 0.0779461 |
| TRUE | 26900 | 7 | 0.9220539 |
Plotting Number of DOIs in the top 7 publishers by institution
# Fixed color assignments so each publisher keeps its color across plots
top7colors <- c(
  "Harvard Dataverse"           = "dodgerblue2",
  "Zenodo"                      = "darkorange1",
  "ICPSR"                       = "darkcyan",
  "Dryad"                       = "lightgray",
  "Qualitative Data Repository" = "gold1",
  "figshare"                    = "purple",
  "Institutional Repository"    = "lightblue"
)

# Dodged bars of data-DOI counts for the top 7 publishers, by institution.
# The y axis is clipped at 5000 (see the caption about Michigan's bar).
(by_publisher_data_plot <- by_publisher_data %>%
  filter(publisher %in% top7pubs) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top7colors, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  scale_y_continuous(breaks = seq(from = 0, to = 5000, by = 500)) +
  coord_cartesian(ylim = c(0, 5000)) +
  labs(x = "Institution", y = "Count of Data DOIs",
       caption = "Note: Michigan Dataverse bar cutoff for scaling") +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5))

ggsave(by_publisher_data_plot,
       filename = "figures/Counts of Data DOIs by Institution - ForPaper.png",
       device = "png", width = 8, height = 6, units = "in")
Look at the top software publishers (This excludes CrossRef affiliation data, as software is not a resource type).
# Rank software publishers by total DOI count and plot the top 20.
# row_number() replaces order(count, decreasing = T): identical ranks on
# the sorted counts, without the T shorthand.
by_publisher_software %>%
  group_by(publisher) %>%
  summarize(count = sum(count)) %>%
  arrange(desc(count)) %>%
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 20), n.breaks = 20) +
  labs(x = "Publisher Rank", y = "Number of DOIs",
       title = "Number of Software DOIs by top Publishers") +
  coord_cartesian(xlim = c(1, 20)) +
  theme_bw()
It looks like there is one primary software publisher, but we could also take the top 4 or 5 to capture the majority.
# Top 6 software publishers and fixed colors for each
top6pubs_soft <- by_publisher_software_table$publisher[1:6]
top6colors_soft <- c(
  "Zenodo"                   = "darkorange1",
  "Code Ocean"               = "darkblue",
  "Institutional Repository" = "lightblue",
  "Optica Publishing Group"  = "red",
  "CoMSES Net"               = "pink",
  "figshare"                 = "purple"
)

# Dodged bars of software-DOI counts for the top 6 publishers, by institution
(by_publisher_software_plot <- by_publisher_software %>%
  filter(publisher %in% top6pubs_soft) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top6colors_soft, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  labs(x = "Institution", y = "Count of Software DOIs") +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5))

ggsave(by_publisher_software_plot,
       filename = "figures/Counts of Software DOIs by Institution.png",
       device = "png", width = 8, height = 6, units = "in")
Some repositories (such as Harvard’s Dataverse and Qualitative Data Repository) assign DOIs at the level of the file, rather than the study. Similarly, Zenodo often has many related DOIs for multiple figures within a study. In order to attempt to compare study-to-study counts of data sharing, look at the DOIs collapsed by “container”.
# One row per container DOI (file-level DOIs share a container identifier),
# with the number of child DOIs in each container
by_container <- all_dois2 %>%
  filter(!is.na(container_identifier)) %>%
  group_by(container_identifier, publisher, title, institution) %>%
  summarize(count = n()) %>%
  arrange(desc(count))

How many publishers have container DOIs?

by_container %>%
  group_by(publisher) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  datatable()
Collapsing by container for counts
# Keep the first DOI per container and drop the rest.
# Logical indexing replaces the original which()/negative-index form:
# had no duplicate containers existed, all_dois2[-integer(0), ] would
# have silently dropped *every* row.
containerdups <- !is.na(all_dois2$container_identifier) &
  duplicated(all_dois2$container_identifier)
all_dois_collapsed <- all_dois2[!containerdups, ]
Faculty Opinions LTD and ENCODE Data Coordination Center are removed from this analysis as well.
# Dataset DOIs with one row per container, excluding the two outlier
# publishers flagged above
data_dois_collapse <- all_dois_collapsed %>%
  filter(resourceTypeGeneral == "Dataset",
         publisher != "ENCODE Data Coordination Center",
         publisher != "Faculty Opinions Ltd")

# Collapsed data-DOI counts per publisher and institution
by_publisher_data_collapse <- data_dois_collapse %>%
  group_by(publisher, institution) %>%
  summarize(count = n()) %>%
  arrange(institution, desc(count))
Table of publisher counts
# Wide table of collapsed data-DOI counts: one column per institution,
# with a cross-institution total
by_publisher_data_collapse_table <- by_publisher_data_collapse %>%
  pivot_wider(
    names_from  = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

by_publisher_data_collapse_table %>%
  datatable()

Write out the table of data publishers

write.csv(by_publisher_data_collapse_table,
          file = "data_summary_data/Counts of Data Publishers By Insitituion - Collapsed by container.csv",
          row.names = FALSE)
# NOTE(review): this duplicates by_publisher_data_collapse and its table
# above; kept because the *_dc_* names are what the plots below reference.
by_publisher_data_dc_collapse <- data_dois_collapse %>%
  group_by(publisher, institution) %>%
  summarize(count = n()) %>%
  arrange(institution, desc(count))

# Table of publishers - data
by_publisher_data_dc_collapse_table <- by_publisher_data_dc_collapse %>%
  pivot_wider(
    names_from  = institution,
    values_from = count,
    values_fill = 0
  ) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))
Look at publishers based on rank of number of DOIs
# Rank publishers by collapsed DOI count and plot the top 25.
# row_number() replaces order(count, decreasing = T): identical ranks on
# the sorted counts, without the T shorthand.
by_publisher_data_dc_collapse_table %>%
  group_by(publisher) %>%
  summarize(count = sum(Total)) %>%
  arrange(desc(count)) %>%
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 25)) +
  labs(x = "Publisher Rank", y = "Number of DOIs",
       title = "Number of DOIs by top Publishers") +
  theme_bw()
Look at the top 7 publishers - how many does this capture?
# Top 7 publishers after collapsing by container
top7pubs <- by_publisher_data_dc_collapse_table$publisher[1:7]

# Share of the collapsed DOIs covered by those 7 publishers
by_publisher_data_dc_collapse_table %>%
  group_by(publisher) %>%
  summarize(count = sum(Total)) %>%
  mutate(intop7pub = publisher %in% top7pubs) %>%
  group_by(intop7pub) %>%
  summarize(totalDOIs = sum(count), nrepos = n()) %>%
  ungroup() %>%
  mutate(propDOIs = totalDOIs / sum(totalDOIs))
## # A tibble: 2 × 4
## intop7pub totalDOIs nrepos propDOIs
## <lgl> <int> <int> <dbl>
## 1 FALSE 1547 159 0.112
## 2 TRUE 12228 7 0.888
# Color map for the collapsed top 7 (Taylor & Francis appears here in
# place of the Qualitative Data Repository used in the earlier plot)
top7colors <- c(
  "Harvard Dataverse"        = "dodgerblue2",
  "Zenodo"                   = "darkorange1",
  "ICPSR"                    = "darkcyan",
  "Dryad"                    = "lightgray",
  "figshare"                 = "purple",
  "Institutional Repository" = "lightblue",
  "Taylor & Francis"         = "gold2"
)

# Dodged bars of collapsed data-DOI counts for the top 7 publishers
(by_publisher_data_plot_collapse <- by_publisher_data_dc_collapse %>%
  filter(publisher %in% top7pubs) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top7colors, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  #scale_y_continuous(breaks = seq(from = 0, to=5000, by=500)) +
  #coord_cartesian(ylim = c(0,5000)) +
  labs(x = "Institution", y = "Count of Collapsed Data DOIs") +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5))

ggsave(by_publisher_data_plot_collapse,
       filename = "figures/Counts of Data DOIs by Institution_DOIcollapsed.png",
       device = "png", width = 8, height = 6, units = "in")
We can also look at the data collapsed by version of a record. This was motivated because some repositories have multiple entries for the different versions of the same dataset/collection. And some entries have many versions.
Explore versions
Some repositories attach “vX” to the DOI.
# Flag DOIs whose suffix ends in ".v<number>" (version-suffixed DOIs)
all_dois_collapsed <- all_dois_collapsed %>%
  mutate(hasversion = grepl("\\.v[[:digit:]]+$", DOI))

# Which publishers use version-suffixed DOIs, and how often?
all_dois_collapsed %>%
  filter(hasversion) %>%
  group_by(publisher, hasversion) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  datatable()
Some repositories use the “versionCount” field
# Publishers that populate versionCount, with average versions per record
all_dois_collapsed %>%
  filter(versionCount > 0) %>%
  group_by(publisher) %>%
  summarize(count = n(),
            AvgNversions = round(mean(versionCount), 2)) %>%
  arrange(desc(count)) %>%
  datatable()
Some use “metadataVersion”
# Publishers that populate metadataVersion, with the average value
all_dois_collapsed %>%
  filter(metadataVersion > 0) %>%
  group_by(publisher) %>%
  summarize(count = n(),
            AvgNversions = round(mean(metadataVersion), 2)) %>%
  arrange(desc(count)) %>%
  datatable()
How to collapse by version? Maybe that’s for another day…
Write out CSV files for each institution:
# Export one CSV of DOIs (raw and container-collapsed) per institution.
# The datestamp is loop-invariant, so compute it once up front.
datestamp <- gsub("-", "", Sys.Date())
for (i in unique(all_dois2$institution)) {
  # FIX(review): the original filtered `all_dois` (pre-cleaning) while
  # iterating institutions taken from `all_dois2`; use the cleaned
  # all_dois2 consistently — confirm this matches the intended export.
  all_dois2 %>%
    filter(institution == i) %>%
    write.csv(file = paste0("data_all_dois/All_dois_", i, datestamp, ".csv"),
              row.names = FALSE)
  all_dois_collapsed %>%
    filter(institution == i) %>%
    write.csv(file = paste0("data_all_dois/All_dois_collapsed_", i,
                            datestamp, ".csv"),
              row.names = FALSE)
}